An example exploratory analysis script

Published

February 7, 2024

Setup

#load needed packages. make sure they are installed.
pacman::p_load(here, knitr, tidyverse, skimr, fpp2, tigris, plotly, gganimate, viridis, transformr)
theme_set(theme_minimal())

Load the data.

d1 <- readRDS(here('data','processed-data','processed-crime.rds'))
zips <- read_csv(here('data','raw-data','austin-zip-codes.csv'))
Rows: 54 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): the_geom, AFFGEOID10
dbl (4): Zip Code, GEOID10, ALAND10, AWATER10

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data exploration through tables

Showing a bit of code to produce and save a summary table.

summary(d1)
 Incident.Number     Highest.Offense.Description Highest.Offense.Code
 Min.   :2.004e+04   Length:2461621              Min.   : 100        
 1st Qu.:2.005e+10   Class :character            1st Qu.: 601        
 Median :2.011e+10   Mode  :character            Median :1199        
 Mean   :6.032e+10                               Mean   :1689        
 3rd Qu.:2.017e+10                               3rd Qu.:2716        
 Max.   :2.024e+12                               Max.   :8905        
                                                                     
 Family.Violence    Occurred.Date.Time               Occurred.Date       
 Length:2461621     Min.   :2003-01-01 00:00:00.00   Min.   :2003-01-01  
 Class :character   1st Qu.:2007-11-16 16:49:00.00   1st Qu.:2007-11-16  
 Mode  :character   Median :2012-05-28 23:09:00.00   Median :2012-05-28  
                    Mean   :2012-11-25 19:50:28.18   Mean   :2012-11-25  
                    3rd Qu.:2017-10-26 21:19:00.00   3rd Qu.:2017-10-26  
                    Max.   :2024-06-01 23:46:00.00   Max.   :2024-06-01  
                                                                         
 Occurred.Time     Report.Date.Time                  Report.Date        
 Length:2461621    Min.   :2002-11-29 05:30:00.00   Min.   :2002-11-29  
 Class1:hms        1st Qu.:2007-11-27 22:41:00.00   1st Qu.:2007-11-27  
 Class2:difftime   Median :2012-06-06 11:15:00.00   Median :2012-06-06  
 Mode  :numeric    Mean   :2012-12-04 16:45:39.02   Mean   :2012-12-04  
                   3rd Qu.:2017-11-05 01:56:00.00   3rd Qu.:2017-11-05  
                   Max.   :2024-06-02 01:20:00.00   Max.   :2024-06-02  
                                                                        
 Report.Time       Location.Type        Address             Zip.Code    
 Length:2461621    Length:2461621     Length:2461621     Min.   :76574  
 Class1:hms        Class :character   Class :character   1st Qu.:78717  
 Class2:difftime   Mode  :character   Mode  :character   Median :78741  
 Mode  :numeric                                          Mean   :78732  
                                                         3rd Qu.:78752  
                                                         Max.   :78759  
                                                                        
 Council.District  APD.Sector        APD.District           PRA           
 Min.   : 1.000   Length:2461621     Length:2461621     Length:2461621    
 1st Qu.: 3.000   Class :character   Class :character   Class :character  
 Median : 4.000   Mode  :character   Mode  :character   Mode  :character  
 Mean   : 4.965                                                           
 3rd Qu.: 7.000                                                           
 Max.   :10.000                                                           
 NA's   :30699                                                            
  Census.Tract      Clearance.Status   Clearance.Date       UCR.Category      
 Min.   :     1.0   Length:2461621     Min.   :2003-01-01   Length:2461621    
 1st Qu.:    15.0   Class :character   1st Qu.:2008-04-07   Class :character  
 Median :    23.2   Mode  :character   Median :2012-10-17   Mode  :character  
 Mean   :   245.4                      Mean   :2013-03-14                     
 3rd Qu.:   338.0                      3rd Qu.:2018-01-19                     
 Max.   :950800.0                      Max.   :2024-06-02                     
 NA's   :8822                          NA's   :348308                         
 Category.Description  X.coordinate      Y.coordinate         Latitude    
 Length:2461621       Min.   :      0   Min.   :       0   Min.   :30.01  
 Class :character     1st Qu.:3108421   1st Qu.:10057433   1st Qu.:30.23  
 Mode  :character     Median :3117292   Median :10073004   Median :30.28  
                      Mean   :3075787   Mean   : 9946761   Mean   :30.29  
                      3rd Qu.:3126595   3rd Qu.:10100561   3rd Qu.:30.35  
                      Max.   :3231806   Max.   :10215496   Max.   :30.67  
                                                           NA's   :32335  
   Longitude        Location        
 Min.   :-98.18   Length:2461621    
 1st Qu.:-97.76   Class :character  
 Median :-97.73   Mode  :character  
 Mean   :-97.73                     
 3rd Qu.:-97.70                     
 Max.   :-97.37                     
 NA's   :32335                      
head(d1)
  Incident.Number    Highest.Offense.Description Highest.Offense.Code
1      2013851154 SEXUAL ASSAULT OF CHILD/OBJECT                 1707
2     20161800084                RAPE OF A CHILD                  204
3      2010701921                           RAPE                  200
4     20071820003                           RAPE                  200
5     20062192048       SEXUAL ASSAULT W/ OBJECT                 1700
6     20033211543                           RAPE                  200
  Family.Violence  Occurred.Date.Time Occurred.Date Occurred.Time
1               Y 2009-01-01 00:01:00    2009-01-01      00:01:00
2               Y 2016-06-28 01:05:00    2016-06-28      01:05:00
3               Y 2010-03-04 19:15:00    2010-03-04      19:15:00
4               N 2007-07-01 12:00:00    2007-07-01      12:00:00
5               N 2006-08-07 22:28:00    2006-08-07      22:28:00
6               Y 2003-11-17 14:00:00    2003-11-17      14:00:00
     Report.Date.Time Report.Date Report.Time    Location.Type
1 2013-03-26 16:56:00  2013-03-26    16:56:00 RESIDENCE / HOME
2 2016-06-28 01:05:00  2016-06-28    01:05:00 RESIDENCE / HOME
3 2010-03-11 17:06:00  2010-03-11    17:06:00 RESIDENCE / HOME
4 2007-07-01 12:00:00  2007-07-01    12:00:00 RESIDENCE / HOME
5 2006-08-07 22:28:00  2006-08-07    22:28:00 RESIDENCE / HOME
6 2003-11-17 21:40:00  2003-11-17    21:40:00 RESIDENCE / HOME
                   Address Zip.Code Council.District APD.Sector APD.District
1      900 BLOCK E 32ND ST    78705                9         BA            1
2 6900 BLOCK BRANCHWOOD DR    78744                2         FR            8
3   400 BLOCK ANGEL OAK ST    78748                5         FR            2
4     1700 BLOCK WOOTEN DR    78757                7         ID            7
5    500 BLOCK E OLTORF ST    78704                9         DA            2
6    7300 BLOCK DANJEAN DR    78745               NA         DA            6
  PRA Census.Tract Clearance.Status Clearance.Date UCR.Category
1 348         4.00                C     2013-04-11          11C
2 530        24.41                C     2016-07-01          11A
3 542        24.38                C     2010-03-18          11A
4 247       405.00                O     2007-08-02          11A
5 479        23.23                      2006-08-22          11C
6 525      1728.00                O     2003-11-30          11A
  Category.Description X.coordinate Y.coordinate Latitude Longitude Location
1                 Rape            0            0       NA        NA         
2                 Rape            0            0       NA        NA         
3                 Rape            0            0       NA        NA         
4                 Rape            0            0       NA        NA         
5                 Rape            0            0       NA        NA         
6                 Rape            0            0       NA        NA         
str(d1)
'data.frame':   2461621 obs. of  27 variables:
 $ Incident.Number            : num  2.01e+09 2.02e+10 2.01e+09 2.01e+10 2.01e+10 ...
 $ Highest.Offense.Description: chr  "SEXUAL ASSAULT OF CHILD/OBJECT" "RAPE OF A CHILD" "RAPE" "RAPE" ...
 $ Highest.Offense.Code       : int  1707 204 200 200 1700 200 902 2703 200 2006 ...
 $ Family.Violence            : chr  "Y" "Y" "Y" "N" ...
 $ Occurred.Date.Time         : POSIXct, format: "2009-01-01 00:01:00" "2016-06-28 01:05:00" ...
 $ Occurred.Date              : Date, format: "2009-01-01" "2016-06-28" ...
 $ Occurred.Time              : 'hms' num  00:01:00 01:05:00 19:15:00 12:00:00 ...
  ..- attr(*, "units")= chr "secs"
 $ Report.Date.Time           : POSIXct, format: "2013-03-26 16:56:00" "2016-06-28 01:05:00" ...
 $ Report.Date                : Date, format: "2013-03-26" "2016-06-28" ...
 $ Report.Time                : 'hms' num  16:56:00 01:05:00 17:06:00 12:00:00 ...
  ..- attr(*, "units")= chr "secs"
 $ Location.Type              : chr  "RESIDENCE / HOME" "RESIDENCE / HOME" "RESIDENCE / HOME" "RESIDENCE / HOME" ...
 $ Address                    : chr  "900 BLOCK E 32ND ST" "6900 BLOCK BRANCHWOOD DR" "400 BLOCK ANGEL OAK ST" "1700 BLOCK WOOTEN DR" ...
 $ Zip.Code                   : int  78705 78744 78748 78757 78704 78745 78702 78759 78705 78741 ...
 $ Council.District           : int  9 2 5 7 9 NA 3 10 9 3 ...
 $ APD.Sector                 : chr  "BA" "FR" "FR" "ID" ...
 $ APD.District               : chr  "1" "8" "2" "7" ...
 $ PRA                        : chr  "348" "530" "542" "247" ...
 $ Census.Tract               : num  4 24.4 24.4 405 23.2 ...
 $ Clearance.Status           : chr  "C" "C" "C" "O" ...
 $ Clearance.Date             : Date, format: "2013-04-11" "2016-07-01" ...
 $ UCR.Category               : chr  "11C" "11A" "11A" "11A" ...
 $ Category.Description       : chr  "Rape" "Rape" "Rape" "Rape" ...
 $ X.coordinate               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Y.coordinate               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Latitude                   : num  NA NA NA NA NA NA NA NA NA NA ...
 $ Longitude                  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ Location                   : chr  "" "" "" "" ...
skim(d1)
Data summary
Name d1
Number of rows 2461621
Number of columns 27
_______________________
Column type frequency:
character 11
Date 3
difftime 2
numeric 9
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Highest.Offense.Description 0 1 3 48 0 436 0
Family.Violence 0 1 1 1 0 2 0
Location.Type 0 1 7 47 0 47 0
Address 0 1 8 74 0 246951 0
APD.Sector 0 1 2 5 0 14 0
APD.District 0 1 1 2 0 21 0
PRA 0 1 1 4 0 742 0
Clearance.Status 0 1 0 1 615856 4 0
UCR.Category 0 1 0 3 1550375 17 0
Category.Description 0 1 0 18 1550375 8 0
Location 0 1 0 27 32335 219842 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Occurred.Date 0 1.00 2003-01-01 2024-06-01 2012-05-28 7823
Report.Date 0 1.00 2002-11-29 2024-06-02 2012-06-06 7825
Clearance.Date 348308 0.86 2003-01-01 2024-06-02 2012-10-17 7814

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
Occurred.Time 0 1 0 secs 86340 secs 14:25:00 1440
Report.Time 0 1 0 secs 86340 secs 14:06:00 1440

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Incident.Number 0 1.00 6.031558e+10 2.896224e+11 20035.00 2.005329e+10 2.010505e+10 2.017186e+10 2.024242e+12 ▇▁▁▁▁
Highest.Offense.Code 0 1.00 1.689080e+03 1.218280e+03 100.00 6.010000e+02 1.199000e+03 2.716000e+03 8.905000e+03 ▇▅▁▁▁
Zip.Code 0 1.00 7.873243e+04 2.510000e+01 76574.00 7.871700e+04 7.874100e+04 7.875200e+04 7.875900e+04 ▁▁▁▁▇
Council.District 30699 0.99 4.960000e+00 2.840000e+00 1.00 3.000000e+00 4.000000e+00 7.000000e+00 1.000000e+01 ▅▇▃▃▅
Census.Tract 8822 1.00 2.453700e+02 3.363970e+03 1.00 1.500000e+01 2.324000e+01 3.380000e+02 9.508000e+05 ▇▁▁▁▁
X.coordinate 0 1.00 3.075787e+06 3.551571e+05 0.00 3.108421e+06 3.117292e+06 3.126595e+06 3.231806e+06 ▁▁▁▁▇
Y.coordinate 0 1.00 9.946761e+06 1.147895e+06 0.00 1.005743e+07 1.007300e+07 1.010056e+07 1.021550e+07 ▁▁▁▁▇
Latitude 32335 0.99 3.029000e+01 8.000000e-02 30.01 3.023000e+01 3.028000e+01 3.035000e+01 3.067000e+01 ▁▇▇▂▁
Longitude 32335 0.99 -9.773000e+01 5.000000e-02 -98.18 -9.776000e+01 -9.773000e+01 -9.770000e+01 -9.737000e+01 ▁▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
Occurred.Date.Time 0 1 2003-01-01 00:00:00 2024-06-01 23:46:00 2012-05-28 23:09:00 1738386
Report.Date.Time 0 1 2002-11-29 05:30:00 2024-06-02 01:20:00 2012-06-06 11:15:00 2169726
summary(zips)
   the_geom            Zip Code      AFFGEOID10           GEOID10     
 Length:54          Min.   :78610   Length:54          Min.   :78610  
 Class :character   1st Qu.:78704   Class :character   1st Qu.:78704  
 Mode  :character   Median :78730   Mode  :character   Median :78730  
                    Mean   :78716                      Mean   :78716  
                    3rd Qu.:78745                      3rd Qu.:78745  
                    Max.   :78759                      Max.   :78759  
    ALAND10             AWATER10       
 Min.   :   824603   Min.   :       0  
 1st Qu.: 19996385   1st Qu.:       0  
 Median : 33872800   Median :   45636  
 Mean   : 54514717   Mean   :  815147  
 3rd Qu.: 54926871   3rd Qu.:  541028  
 Max.   :327324441   Max.   :10884130  
head(zips)
# A tibble: 6 × 6
  the_geom                        `Zip Code` AFFGEOID10 GEOID10 ALAND10 AWATER10
  <chr>                                <dbl> <chr>        <dbl>   <dbl>    <dbl>
1 MULTIPOLYGON (((-97.5275109999…      78612 8600000US…   78612  2.43e8  1463994
2 MULTIPOLYGON (((-97.8449419999…      78745 8600000US…   78745  3.46e7        0
3 MULTIPOLYGON (((-97.7376429999…      78751 8600000US…   78751  6.21e6        0
4 MULTIPOLYGON (((-97.737174 30.…      78702 8600000US…   78702  1.29e7   491456
5 MULTIPOLYGON (((-97.749349 30.…      78741 8600000US…   78741  1.97e7   557552
6 MULTIPOLYGON (((-97.638821 30.…      78653 8600000US…   78653  2.71e8  1296973
str(zips)
spc_tbl_ [54 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ the_geom  : chr [1:54] "MULTIPOLYGON (((-97.52751099999999 30.092827999999997, -97.527214 30.094862, -97.52465099999999 30.097151999999"| __truncated__ "MULTIPOLYGON (((-97.84494199999999 30.200414, -97.839648 30.199551999999994, -97.833557 30.209342999999993, -97"| __truncated__ "MULTIPOLYGON (((-97.73764299999999 30.303493, -97.73297600000001 30.310843, -97.737571 30.313464, -97.735227 30"| __truncated__ "MULTIPOLYGON (((-97.737174 30.259136, -97.732996 30.270465, -97.730048 30.278506, -97.728487 30.278568, -97.711"| __truncated__ ...
 $ Zip Code  : num [1:54] 78612 78745 78751 78702 78741 ...
 $ AFFGEOID10: chr [1:54] "8600000US78612" "8600000US78745" "8600000US78751" "8600000US78702" ...
 $ GEOID10   : num [1:54] 78612 78745 78751 78702 78741 ...
 $ ALAND10   : num [1:54] 2.43e+08 3.46e+07 6.21e+06 1.29e+07 1.97e+07 ...
 $ AWATER10  : num [1:54] 1463994 0 0 491456 557552 ...
 - attr(*, "spec")=
  .. cols(
  ..   the_geom = col_character(),
  ..   `Zip Code` = col_double(),
  ..   AFFGEOID10 = col_character(),
  ..   GEOID10 = col_double(),
  ..   ALAND10 = col_double(),
  ..   AWATER10 = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 
skim(zips)
Data summary
Name zips
Number of rows 54
Number of columns 6
_______________________
Column type frequency:
character 2
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
the_geom 0 1 811 10887 0 54 0
AFFGEOID10 0 1 14 14 0 54 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Zip Code 0 1 78716.44 40.9 78610 78704.25 78729.5 78744.75 78759 ▁▂▁▅▇
GEOID10 0 1 78716.44 40.9 78610 78704.25 78729.5 78744.75 78759 ▁▂▁▅▇
ALAND10 0 1 54514716.76 69396455.1 824603 19996384.75 33872799.5 54926871.00 327324441 ▇▁▁▁▁
AWATER10 0 1 815146.96 2006503.2 0 0.00 45635.5 541028.00 10884130 ▇▁▁▁▁

Important to note that the dataset has several date fields, and that alone will be ripe with opportunity for exploration. However I want to take a look at some of the other variables first, to see if maybe isolating to one of the other categories, or including other categorical variables in the time series could be useful. First off is Zip Code, because location is often a major factor in determining the amount and type of crime that would occur.

d1 %>% 
  mutate(years = year(Occurred.Date),
         years = as.character(years)
         ) %>% 
  group_by(years, Zip.Code) %>% 
  summarize(
    count = n()
  ) %>% as.data.frame() %>% pivot_wider(names_from = years, values_from = count)
`summarise()` has grouped output by 'years'. You can override using the
`.groups` argument.
# A tibble: 69 × 23
   Zip.Code `2003` `2004` `2005` `2006` `2007` `2008` `2009` `2010` `2011`
      <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
 1    76574      1      4      1     NA      2      6     NA      1     NA
 2    78610      1     17     14     10      5      8      9      6      5
 3    78612      1     NA      1      3     NA      2     NA     NA     NA
 4    78613    379    435    301    343    338    470    461    442    431
 5    78617    748    756    722    744    840   1035   1038   1084   1063
 6    78620      2      1      5      2      3      1      1     NA     NA
 7    78634      2     NA     NA      1      1      1     NA     NA     NA
 8    78641     11      3      6      9      5     11      1      3     NA
 9    78642      1     NA     NA     NA     NA      1     NA     NA     NA
10    78645      4      1      5      3      4      4     NA      1     NA
# ℹ 59 more rows
# ℹ 13 more variables: `2012` <int>, `2013` <int>, `2014` <int>, `2015` <int>,
#   `2016` <int>, `2017` <int>, `2018` <int>, `2019` <int>, `2020` <int>,
#   `2021` <int>, `2022` <int>, `2023` <int>, `2024` <int>

Layering in the year gives an interesting view, because it gives us the opportunity to see major increases in certain locations over time. However, there are a lot of zip Codes and a lot of years in this dataset. A visualization may help with this, my first thought being a chloropeth plot with a slider for the year value. Other quicker solutions might be looking at the individual years as observations of sorts, then looking at summary statistics. More to come here.

d1 %>% 
  mutate(years = year(Occurred.Date)
         #years = as.character(years)
         ) %>% 
  #filter(years == '2023' | years == '2022') %>% 
  #group_by(Family.Violence, Highest.Offense.Description, years) %>% 
  group_by(Highest.Offense.Description, years) %>% 
  summarize(cnt = n(), .groups = 'drop') %>% 
  #group_by(Highest.Offense.Description, years) %>% 
  #mutate(tot = sum(cnt)) %>% 
  #relocate(tot, .after = years) %>% 
  mutate(Highest.Offense.Description = as.factor(Highest.Offense.Description)) %>%
  as.data.frame() %>%
  arrange(years) %>% 
  pivot_wider(names_from = years, values_from = cnt) 
# A tibble: 436 × 23
   Highest.Offense.Description  `2003` `2004` `2005` `2006` `2007` `2008` `2009`
   <fct>                         <int>  <int>  <int>  <int>  <int>  <int>  <int>
 1 ABUSE OF OFFICIAL CAPACITY        1      5      2      2      4      2      1
 2 AGG ASLT W/MOTOR VEH FAM/DA…     25     41     37     55     52     54     50
 3 AGG ASSAULT                     718    809    805    810    849    971    885
 4 AGG ASSAULT FAM/DATE VIOLEN…    426    446    497    564    610    628    605
 5 AGG ASSAULT ON PUBLIC SERVA…     26     25     16     25     14     11     24
 6 AGG ASSAULT WITH MOTOR VEH      119    147    104     88    109    121    102
 7 AGG FORCED SODOMY                 5      2      2      2      3      3      3
 8 AGG FORCED SODOMY OF CHILD       36     34     24     23     11     15      1
 9 AGG KIDNAPPING                    7      6      7      6      8      9      9
10 AGG PERJURY                       1     NA     NA     NA     NA     NA     NA
# ℹ 426 more rows
# ℹ 15 more variables: `2010` <int>, `2011` <int>, `2012` <int>, `2013` <int>,
#   `2014` <int>, `2015` <int>, `2016` <int>, `2017` <int>, `2018` <int>,
#   `2019` <int>, `2020` <int>, `2021` <int>, `2022` <int>, `2023` <int>,
#   `2024` <int>

Similar to our zip code variable here, but there are far too many crime descriptions to use. Grouping some together may work, I don’t think we have enough information in the dataset overall to do any sort of classification modeling however, so it may amount to a rules based classification using regular expressions. Other variables may still help with this, such as the UCF classification variables.

d1 %>% 
  mutate(
    Category.Description = if_else(
      Category.Description == '', 'NA/Unknown', Category.Description
      )
    ) %>% 
  group_by(Category.Description, Highest.Offense.Description) %>% 
  summarize(
    cnt = n()
    ) %>% 
  pivot_wider(
    names_from = Category.Description, values_from = cnt
    )
`summarise()` has grouped output by 'Category.Description'. You can override
using the `.groups` argument.
# A tibble: 436 × 9
   Highest.Offense.Description `Aggravated Assault` `Auto Theft` Burglary Murder
   <chr>                                      <int>        <int>    <int>  <int>
 1 AGG ASLT ENHANC STRANGL/SU…                 1098           NA       NA     NA
 2 AGG ASLT STRANGLE/SUFFOCATE                 7712           NA       NA     NA
 3 AGG ASLT W/MOTOR VEH FAM/D…                  772           NA       NA     NA
 4 AGG ASSAULT                                18634           NA       NA     NA
 5 AGG ASSAULT BY PUBLIC SERV…                   10           NA       NA     NA
 6 AGG ASSAULT FAM/DATE VIOLE…                 9603           NA       NA     NA
 7 AGG ASSAULT ON PEACE OFFIC…                   70           NA       NA     NA
 8 AGG ASSAULT ON PUBLIC SERV…                  329           NA       NA     NA
 9 AGG ASSAULT WITH MOTOR VEH                  1823           NA       NA     NA
10 ARSON WITH BODILY INJURY                      15           NA       NA     NA
# ℹ 426 more rows
# ℹ 4 more variables: `NA/Unknown` <int>, Rape <int>, Robbery <int>,
#   Theft <int>

The UCF classifiers are much more consumable, but there are more incidents that are not classified than are. That said, this might be a good start for a rules based classification. Many of the unclassified incidents would not fit very nicely into the groups anyway, so some new categories may need to be made. I may or may not come back to the re-classification part, but ultimately the genesis of this project was with reports on the amount of homicide in the city, so the already existing descriptions may be enough to drill down into just those.

reminder to myself to save figures from this file to bring into manuscript and presentation later.

Data exploration through figures

Time is a variable that is much easier to interpret with a plot, so let’s start there. Time series first.

d1 %>% group_by(Occurred.Date) %>% 
  summarize(
  cnt = n()
) %>% as.data.frame() %>% 
  ggplot(mapping = aes(x=Occurred.Date, y=cnt)) +
  geom_line()

I expected aggregating by day to actually look worse than this. There is a clear trend, upwards at first but then starts decreasing around 2008 or so. Seeing this I am tempted to looking into time series modeling. There may be a seasonal pattern in some of those spikes, but that may be something that can be resolved with different methods. I’m tempted to go ARIMA, because frankly that’s the one I’m familiar with, but I’ll have to do some reading up on my options. An interesting idea might be to fit the model, make predictions, and then see where I landed compared to the actual data before the class ends.

d1 %>% filter(Occurred.Date <= '2023-12-31') %>%
  mutate(Occurred.Date = trunc.Date(Occurred.Date, 'months')) %>% 
  group_by(Occurred.Date) %>% 
  summarize(
  cnt = n()
) %>% as.data.frame() %>% 
  ggplot(mapping = aes(x=Occurred.Date, y=cnt)) +
  geom_line()

Monthly looks even better, again further lending itself to the idea of a time series model. The downward spikes are more prevalent here and may cause problems themselves as well, but again the seasonality may be able to be resolved with things like logs or differencing. As a note, in this and the last plot I filter out 2024, mostly due to a large drop at the end from the incomplete month, but also for the aforementioned idea of comparing predictions to actual results.

A potentially quick way to identify “peak” months for crime would be via a bar chart aggregating all occurrences in each month together. I was hoping to see some quick trends here, but I see surprisingly little here. The highest month is May, potentially because of school letting out/ graduations for UT Austin, but I wouldnt say the difference is large enough compared to other months to really point to any one reason. This may be another one to layer in with time, like a chart with a slider for the year to see shifts in crime frequency by month over time.

d2 <- d1 %>% 
  mutate(
    Occurred.Month = month(Occurred.Date, label = TRUE),
    Occurred.Day = weekdays(Occurred.Date),
    Occurred.Year = year(Occurred.Date),
    Occur.Report.Diff = Report.Date - Occurred.Date
  )
d2 %>% 
  #filter(Occurred.Year == 2023) %>%
  ggplot(aes(x=Occurred.Month)) +
  geom_bar()

Now a Zip Code bar chart. Really a bit of a graphical representation of what we saw earlier, though I filtered it to the two most recent complete years. Again, looking for big swings, and simultaneously taking a quick peak at the Family Violence indicator. Surprisingly low volume there, which leads me to believe it just doesn’t get captured appropriately every time, especially when comparing with the descriptions themselves, which we will see in a moment. Going back to the zip codes, there might be an opportunity to do something like binning the zip codes with high medium and low crime areas, or something of the sort.

d1 %>% 
  mutate(years = year(Occurred.Date),
         years = as.character(years)
         ) %>% 
  filter(years == '2003' | years == '2022') %>% 
  group_by(Family.Violence, Zip.Code, years) %>% 
  summarize(cnt = n(), .groups = 'drop') %>% 
  group_by(Zip.Code, years) %>% 
  mutate(tot = sum(cnt)) %>% 
  relocate(tot, .after = years) %>% 
  mutate(Zip.Code = as.character(Zip.Code)) %>%
  as.data.frame() %>% 
  arrange(desc(tot)) %>% head(100) %>% 
  ggplot(aes(fill=Family.Violence, x= reorder(Zip.Code,cnt), y=cnt)) +
  geom_bar(position = 'stack', stat='identity') +
  facet_wrap(~years) +
  coord_flip()

Now a bar chart by crime description. This is similar to the last plot, but a little more disparate here. There are so many more crime descriptions than zip codes, so they are much more easily spread thin. That said, that actually makes the top five crime descriptions stand out a little more. So going back to the idea of creating new classifications, that may be a good idea to make sure those 5 are appropriately captured. And again, the family violence indicator is captured almost entirely by one description, yet the top description is Family disturbance. An easy explanation may be because the description is of the Highest Offense in the incident, so when there is physical family violence, it is often the highest offense.

d1 %>% 
  mutate(years = year(Occurred.Date),
         years = as.character(years)
         ) %>% 
  filter(years == '2023' | years == '2022') %>% 
  group_by(Family.Violence, Highest.Offense.Description, years) %>% 
  summarize(cnt = n(), .groups = 'drop') %>% 
  group_by(Highest.Offense.Description, years) %>% 
  mutate(tot = sum(cnt)) %>% 
  relocate(tot, .after = years) %>% 
  mutate(Highest.Offense.Description = as.factor(Highest.Offense.Description)) %>%
  as.data.frame() %>% 
  arrange(desc(tot)) %>% head(50) %>% 
  ggplot(aes(fill=Family.Violence, x= reorder(Highest.Offense.Description,cnt), y=cnt)) +
  geom_bar(position = 'stack', stat='identity') +
  facet_wrap(~years) +
  coord_flip() 

Notes

Including this sandbox chunk where I can experiment with code as needed. ARIMA is not going well, mostly due to seasonal nature of data. Filtering the time range might help, or isolating other variables to do things like predict certain types of crime instead.

tig_zips <- zctas(cb=TRUE, starts_with = c(unique(d1$Zip.Code)), year = 2020)
ZCTAs can take several minutes to download.  To cache the data and avoid re-downloading in future R sessions, set `options(tigris_use_cache = TRUE)`

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |=========                                                             |  14%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |==============================                                        |  44%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |====================================================================  |  98%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%
yearlyCrime <- d1 %>% mutate(
  Zip.Char = as.character(Zip.Code),
  years = year(Occurred.Date),
  years = as.character(years)
  ) %>% filter(years != 2024) %>% 
  group_by(Zip.Char, years) %>% 
  summarize(crim.count  = n()) %>% 
  ungroup() %>% 
  as.data.frame()
`summarise()` has grouped output by 'Zip.Char'. You can override using the
`.groups` argument.
  #filter(count >= 500) %>%
  #filter(Zip.Char == '78741') %>%
  

allYrsZips <- expand.grid(years = unique(yearlyCrime$years), Zip.Char = unique(yearlyCrime$Zip.Char))

yearlyCrimeAll <- left_join(allYrsZips, yearlyCrime, by = join_by(Zip.Char, years))  %>% 
  inner_join(tig_zips, by = join_by(Zip.Char == ZCTA5CE20)) %>% 
  mutate(crim.count = ifelse(is.na(crim.count), 0, crim.count))
cpeth2 <- yearlyCrimeAll %>% #filter(crim.count >= 10) %>% 
  ggplot() +
  geom_sf(aes(geometry = geometry, fill = 0)) +
  geom_sf(aes(geometry = geometry, fill = -crim.count, frame = years, label = Zip.Char)) +
  #scale_fill_gradient(trans = 'reverse')
  scale_fill_viridis_c(limits = c(-13000, 0), oob = scales::squish, direction = -1)
Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
: Ignoring unknown aesthetics: frame and label
cpeth2

cplotly2 <- cpeth2 %>% ggplotly(originalData=FALSE) %>% animation_opts(1500, 1)
cplotly2